Inspired by the Deep EDA Tips video by yazaR-Data Science, I would like to practice those tips on my Policy Lapse analysis.

Policy Data

My policy lapse data is loaded and prepared as below:

# Read the raw policy workbook and tidy it in one pipeline:
#   * drop columns that are entirely empty,
#   * drop ID, PaymentTerm0, DistributionChannel0 and PolicyYear,
#   * convert every character column to a factor,
#   * reverse the level order of `Lapsed`.
policy <-
  readxl::read_xlsx("../data/PolicyLapse.xlsx") %>%
  janitor::remove_empty("cols") %>%
  # Alternative selection kept for reference:
  #  select(!where(is.logical),-c("PaymentTerm0","DistributionChannel0","PolicyYear")) %>%
  select(-c("ID", "PaymentTerm0", "DistributionChannel0", "PolicyYear")) %>%
  # across() + where() replaces the superseded mutate_if()
  mutate(across(where(is.character), factor)) %>%
  mutate(Lapsed = fct_rev(Lapsed))

Summary policy data:

Data summary
Name policy
Number of rows 1341
Number of columns 18
_______________________
Column type frequency:
factor 10
numeric 8
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts pct
Lapsed 0 1.00 FALSE 2 Inf: 907, Lap: 434 Inf: 0.68, Lap: 0.32
PO_Sex 0 1.00 FALSE 2 mal: 709, fem: 632 mal: 0.53, fem: 0.47
PO_Married 0 1.00 FALSE 2 Mar: 802, Sin: 539 Mar: 0.60, Sin: 0.40
Occupation 0 1.00 FALSE 4 Grp: 486, Grp: 425, Grp: 263, Grp: 167 Grp: 0.36, Grp: 0.32, Grp: 0.20, Grp: 0.12
Phone_registered 12 0.99 FALSE 2 Yes: 935, No: 394 Yes: 0.70, No: 0.30
PO_is_INS 0 1.00 FALSE 2 No: 1158, Yes: 183 No: 0.86, Yes: 0.14
INS_Sex 0 1.00 FALSE 2 mal: 693, fem: 648 mal: 0.52, fem: 0.48
CoveragePeriod 0 1.00 FALSE 3 5-1: 617, >10: 412, 1-5: 312 5-1: 0.46, >10: 0.31, 1-5: 0.23
PaymentTerm 0 1.00 FALSE 4 Qua: 442, Ann: 421, Sem: 258, Mon: 220 Qua: 0.33, Ann: 0.31, Sem: 0.19, Mon: 0.16
DistributionChannel 0 1.00 FALSE 5 Com: 550, Ban: 401, Cor: 199, Gen: 126 Com: 0.41, Ban: 0.30, Cor: 0.15, Gen: 0.09, Oth: 0.05

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
NumOfReinstated 0 1 0.68 1.08 0 0 0 1 5 ▇▁▁▁▁
NumOfClaims 2 1 0.56 1.06 0 0 0 1 5 ▇▁▁▁▁
NumOfEmails 1 1 1.29 1.04 0 1 1 2 5 ▇▁▁▁▁
NumOfCalls 4 1 1.13 1.13 0 0 1 2 5 ▇▁▁▁▁
PO_Age 0 1 43.31 8.86 22 36 43 51 59 ▂▇▇▇▇
INS_Age 0 1 39.89 13.58 18 28 40 52 64 ▇▆▇▇▆
Premium 0 1 2825.38 2489.13 224 1025 2021 3761 12754 ▇▂▁▁▁
AgentYearSVR 0 1 2.11 1.06 1 1 2 3 6 ▇▂▁▁▁

Plot summary of all variables with DataExplorer

library(DataExplorer)
# Overview plot of the policy data set.
policy %>% plot_intro()

Summary tools

{summarytools}

summarytools library provides quick overview about the whole dataset.

library(summarytools)
# One summary entry per column: type, stats/values, frequencies, and the
# valid and missing counts.
dfSummary(policy)
## Data Frame Summary  
## policy  
## Dimensions: 1341 x 18  
## Duplicates: 0  
## 
## ----------------------------------------------------------------------------------------------------------------------
## No   Variable              Stats / Values            Freqs (% of Valid)     Graph                 Valid      Missing  
## ---- --------------------- ------------------------- ---------------------- --------------------- ---------- ---------
## 1    Lapsed                1. Lapsed                 434 (32.4%)            IIIIII                1341       0        
##      [factor]              2. Inforce                907 (67.6%)            IIIIIIIIIIIII         (100.0%)   (0.0%)   
## 
## 2    NumOfReinstated       Mean (sd) : 0.7 (1.1)     0 : 861 (64.2%)        IIIIIIIIIIII          1341       0        
##      [numeric]             min < med < max:          1 : 217 (16.2%)        III                   (100.0%)   (0.0%)   
##                            0 < 0 < 5                 2 : 140 (10.4%)        II                                        
##                            IQR (CV) : 1 (1.6)        3 :  88 ( 6.6%)        I                                         
##                                                      4 :  30 ( 2.2%)                                                  
##                                                      5 :   5 ( 0.4%)                                                  
## 
## 3    NumOfClaims           Mean (sd) : 0.6 (1.1)     0 : 962 (71.8%)        IIIIIIIIIIIIII        1339       2        
##      [numeric]             min < med < max:          1 : 148 (11.1%)        II                    (99.9%)    (0.1%)   
##                            0 < 0 < 5                 2 : 138 (10.3%)        II                                        
##                            IQR (CV) : 1 (1.9)        3 :  50 ( 3.7%)                                                  
##                                                      4 :  28 ( 2.1%)                                                  
##                                                      5 :  13 ( 1.0%)                                                  
## 
## 4    NumOfEmails           Mean (sd) : 1.3 (1)       0 : 221 (16.5%)        III                   1340       1        
##      [numeric]             min < med < max:          1 : 776 (57.9%)        IIIIIIIIIII           (99.9%)    (0.1%)   
##                            0 < 1 < 5                 2 : 140 (10.4%)        II                                        
##                            IQR (CV) : 1 (0.8)        3 : 158 (11.8%)        II                                        
##                                                      4 :  23 ( 1.7%)                                                  
##                                                      5 :  22 ( 1.6%)                                                  
## 
## 5    NumOfCalls            Mean (sd) : 1.1 (1.1)     0 : 424 (31.7%)        IIIIII                1337       4        
##      [numeric]             min < med < max:          1 : 576 (43.1%)        IIIIIIII              (99.7%)    (0.3%)   
##                            0 < 1 < 5                 2 : 157 (11.7%)        II                                        
##                            IQR (CV) : 2 (1)          3 : 127 ( 9.5%)        I                                         
##                                                      4 :  31 ( 2.3%)                                                  
##                                                      5 :  22 ( 1.6%)                                                  
## 
## 6    PO_Age                Mean (sd) : 43.3 (8.9)    35 distinct values         . : . : .         1341       0        
##      [numeric]             min < med < max:                                     : : : : :         (100.0%)   (0.0%)   
##                            22 < 43 < 59                                       . : : : : : :                           
##                            IQR (CV) : 15 (0.2)                                : : : : : : :                           
##                                                                               : : : : : : :                           
## 
## 7    PO_Sex                1. female                 632 (47.1%)            IIIIIIIII             1341       0        
##      [factor]              2. male                   709 (52.9%)            IIIIIIIIII            (100.0%)   (0.0%)   
## 
## 8    PO_Married            1. Married                802 (59.8%)            IIIIIIIIIII           1341       0        
##      [factor]              2. Single                 539 (40.2%)            IIIIIIII              (100.0%)   (0.0%)   
## 
## 9    Occupation            1. Grp_1                  263 (19.6%)            III                   1341       0        
##      [factor]              2. Grp_2                  486 (36.2%)            IIIIIII               (100.0%)   (0.0%)   
##                            3. Grp_3                  425 (31.7%)            IIIIII                                    
##                            4. Grp_4                  167 (12.5%)            II                                        
## 
## 10   Phone_registered      1. No                     394 (29.6%)            IIIII                 1329       12       
##      [factor]              2. Yes                    935 (70.4%)            IIIIIIIIIIIIII        (99.1%)    (0.9%)   
## 
## 11   PO_is_INS             1. No                     1158 (86.4%)           IIIIIIIIIIIIIIIII     1341       0        
##      [factor]              2. Yes                     183 (13.6%)           II                    (100.0%)   (0.0%)   
## 
## 12   INS_Age               Mean (sd) : 39.9 (13.6)   47 distinct values     :                     1341       0        
##      [numeric]             min < med < max:                                 :     : :   :   .     (100.0%)   (0.0%)   
##                            18 < 40 < 64                                     : : . : : : : : : :                       
##                            IQR (CV) : 24 (0.3)                              : : : : : : : : : :                       
##                                                                             : : : : : : : : : :                       
## 
## 13   INS_Sex               1. female                 648 (48.3%)            IIIIIIIII             1341       0        
##      [factor]              2. male                   693 (51.7%)            IIIIIIIIII            (100.0%)   (0.0%)   
## 
## 14   Premium               Mean (sd) : 2825 (2489)   1126 distinct values   :                     1341       0        
##      [numeric]             min < med < max:                                 : :                   (100.0%)   (0.0%)   
##                            224 < 2021 < 12754                               : :                                       
##                            IQR (CV) : 2736 (0.9)                            : : .                                     
##                                                                             : : : : . . . .                           
## 
## 15   CoveragePeriod        1. >10 yrs                412 (30.7%)            IIIIII                1341       0        
##      [factor]              2. 1-5 yrs                312 (23.3%)            IIII                  (100.0%)   (0.0%)   
##                            3. 5-10 yrs               617 (46.0%)            IIIIIIIII                                 
## 
## 16   PaymentTerm           1. Annualy                421 (31.4%)            IIIIII                1341       0        
##      [factor]              2. Monthly                220 (16.4%)            III                   (100.0%)   (0.0%)   
##                            3. Quartely               442 (33.0%)            IIIIII                                    
##                            4. Semi annual            258 (19.2%)            III                                       
## 
## 17   DistributionChannel   1. Bancas                 401 (29.9%)            IIIII                 1341       0        
##      [factor]              2. Company Agent          550 (41.0%)            IIIIIIII              (100.0%)   (0.0%)   
##                            3. Corp                   199 (14.8%)            II                                        
##                            4. General Agency         126 ( 9.4%)            I                                         
##                            5. Others PD               65 ( 4.8%)                                                      
## 
## 18   AgentYearSVR          Mean (sd) : 2.1 (1.1)     1 : 422 (31.5%)        IIIIII                1341       0        
##      [numeric]             min < med < max:          2 : 518 (38.6%)        IIIIIII               (100.0%)   (0.0%)   
##                            1 < 2 < 6                 3 : 307 (22.9%)        IIII                                      
##                            IQR (CV) : 2 (0.5)        4 :  40 ( 3.0%)                                                  
##                                                      5 :  32 ( 2.4%)                                                  
##                                                      6 :  22 ( 1.6%)                                                  
## ----------------------------------------------------------------------------------------------------------------------

{gtsummary}

gtsummary: creates a Presentation-Ready Data Summary and Analytic Result Tables

library(gtsummary)

# Two-group example: summarise the selected mtcars columns by `am` and
# append a p-value per row.
tbl_summary(
  data = select(mtcars, mpg, hp, am, gear, cyl),
  by = am
) %>%
  add_p()
Characteristic 0, N = 191 1, N = 131 p-value2
mpg 17.3 (14.9, 19.2) 22.8 (21.0, 30.4) 0.002
hp 175 (116, 192) 109 (66, 113) 0.046
gear <0.001
    3 15 (79%) 0 (0%)
    4 4 (21%) 8 (62%)
    5 0 (0%) 5 (38%)
cyl 0.009
    4 3 (16%) 8 (62%)
    6 4 (21%) 3 (23%)
    8 12 (63%) 2 (15%)
1 Median (IQR); n (%)
2 Wilcoxon rank sum test; Fisher's exact test
  • Numeric variables are compared between the two groups with the non-parametric Wilcoxon rank sum test.
  • Categorical variables are checked with Fisher’s exact test when the number of observations is small (for example, when any group has fewer than 5 observations).
# Example with more than two groups: summarise by education level.
tbl_summary(
  data = select(ISLR::Wage, age, wage, education, jobclass),
  by = education
) %>%
  add_p()
Characteristic 1. < HS Grad, N = 2681 2. HS Grad, N = 9711 3. Some College, N = 6501 4. College Grad, N = 6851 5. Advanced Degree, N = 4261 p-value2
age 42 (33, 50) 42 (33, 50) 40 (32, 49) 43 (34, 51) 44 (38, 53) <0.001
wage 81 (70, 97) 94 (78, 110) 105 (89, 121) 119 (100, 143) 142 (117, 171) <0.001
jobclass <0.001
    1. Industrial 190 (71%) 636 (65%) 342 (53%) 274 (40%) 102 (24%)
    2. Information 78 (29%) 335 (35%) 308 (47%) 411 (60%) 324 (76%)
1 Median (IQR); n (%)
2 Kruskal-Wallis rank sum test; Pearson's Chi-squared test
  • The non-parametric Kruskal-Wallis rank sum test is used to test numeric variables across more than two groups.
  • Pearson's Chi-squared test is used to check categorical variables when there is more data.
# Lapsed vs. Inforce comparison on a subset of columns. Continuous variables
# are reported as mean (sd), categorical ones as percentage (n / N).
tbl_summary(
  data = select(
    policy,
    Lapsed, starts_with("PO"), starts_with("Num"),
    Occupation, PaymentTerm, Premium
  ),
  by = Lapsed,
  statistic = list(
    all_continuous() ~ "{mean} ({sd})",
    all_categorical() ~ "{p}% ({n} / {N})"
  )
) %>%
  add_p()
Characteristic Lapsed, N = 4341 Inforce, N = 9071 p-value2
PO_Age 43 (9) 44 (9) 0.050
PO_Sex 0.3
    female 49% (213 / 434) 46% (419 / 907)
    male 51% (221 / 434) 54% (488 / 907)
PO_Married 0.2
    Married 62% (270 / 434) 59% (532 / 907)
    Single 38% (164 / 434) 41% (375 / 907)
PO_is_INS 18% (76 / 434) 12% (107 / 907) 0.004
NumOfReinstated
    0 53% (230 / 434) 70% (631 / 907)
    1 15% (67 / 434) 17% (150 / 907)
    2 14% (61 / 434) 8.7% (79 / 907)
    3 13% (55 / 434) 3.6% (33 / 907)
    4 3.9% (17 / 434) 1.4% (13 / 907)
    5 0.9% (4 / 434) 0.1% (1 / 907)
NumOfClaims
    0 70% (303 / 434) 73% (659 / 905)
    1 11% (47 / 434) 11% (101 / 905)
    2 10% (44 / 434) 10% (94 / 905)
    3 5.1% (22 / 434) 3.1% (28 / 905)
    4 3.5% (15 / 434) 1.4% (13 / 905)
    5 0.7% (3 / 434) 1.1% (10 / 905)
    Unknown 0 2
NumOfEmails 0.3
    0 18% (77 / 434) 16% (144 / 906)
    1 59% (254 / 434) 58% (522 / 906)
    2 12% (51 / 434) 9.8% (89 / 906)
    3 9.2% (40 / 434) 13% (118 / 906)
    4 1.2% (5 / 434) 2.0% (18 / 906)
    5 1.6% (7 / 434) 1.7% (15 / 906)
    Unknown 0 1
NumOfCalls 0.13
    0 35% (152 / 432) 30% (272 / 905)
    1 44% (190 / 432) 43% (386 / 905)
    2 10% (44 / 432) 12% (113 / 905)
    3 8.1% (35 / 432) 10% (92 / 905)
    4 1.4% (6 / 432) 2.8% (25 / 905)
    5 1.2% (5 / 432) 1.9% (17 / 905)
    Unknown 2 2
Occupation <0.001
    Grp_1 31% (133 / 434) 14% (130 / 907)
    Grp_2 43% (187 / 434) 33% (299 / 907)
    Grp_3 22% (95 / 434) 36% (330 / 907)
    Grp_4 4.4% (19 / 434) 16% (148 / 907)
PaymentTerm <0.001
    Annualy 23% (101 / 434) 35% (320 / 907)
    Monthly 21% (90 / 434) 14% (130 / 907)
    Quartely 36% (155 / 434) 32% (287 / 907)
    Semi annual 20% (88 / 434) 19% (170 / 907)
Premium 4,619 (3,039) 1,967 (1,573) <0.001
1 Mean (SD); % (n / N)
2 Wilcoxon rank sum test; Pearson's Chi-squared test
# Default summary of every column, split by lapse status, with p-values.
add_p(tbl_summary(policy, by = Lapsed))
Characteristic Lapsed, N = 4341 Inforce, N = 9071 p-value2
NumOfReinstated
    0 230 (53%) 631 (70%)
    1 67 (15%) 150 (17%)
    2 61 (14%) 79 (8.7%)
    3 55 (13%) 33 (3.6%)
    4 17 (3.9%) 13 (1.4%)
    5 4 (0.9%) 1 (0.1%)
NumOfClaims
    0 303 (70%) 659 (73%)
    1 47 (11%) 101 (11%)
    2 44 (10%) 94 (10%)
    3 22 (5.1%) 28 (3.1%)
    4 15 (3.5%) 13 (1.4%)
    5 3 (0.7%) 10 (1.1%)
    Unknown 0 2
NumOfEmails 0.3
    0 77 (18%) 144 (16%)
    1 254 (59%) 522 (58%)
    2 51 (12%) 89 (9.8%)
    3 40 (9.2%) 118 (13%)
    4 5 (1.2%) 18 (2.0%)
    5 7 (1.6%) 15 (1.7%)
    Unknown 0 1
NumOfCalls 0.13
    0 152 (35%) 272 (30%)
    1 190 (44%) 386 (43%)
    2 44 (10%) 113 (12%)
    3 35 (8.1%) 92 (10%)
    4 6 (1.4%) 25 (2.8%)
    5 5 (1.2%) 17 (1.9%)
    Unknown 2 2
PO_Age 42 (35, 50) 44 (36, 51) 0.050
PO_Sex 0.3
    female 213 (49%) 419 (46%)
    male 221 (51%) 488 (54%)
PO_Married 0.2
    Married 270 (62%) 532 (59%)
    Single 164 (38%) 375 (41%)
Occupation <0.001
    Grp_1 133 (31%) 130 (14%)
    Grp_2 187 (43%) 299 (33%)
    Grp_3 95 (22%) 330 (36%)
    Grp_4 19 (4.4%) 148 (16%)
Phone_registered 302 (70%) 633 (70%) >0.9
    Unknown 4 8
PO_is_INS 76 (18%) 107 (12%) 0.004
INS_Age 39 (29, 50) 41 (28, 52) 0.3
INS_Sex 0.006
    female 186 (43%) 462 (51%)
    male 248 (57%) 445 (49%)
Premium 4,040 (1,934, 7,449) 1,662 (876, 2,446) <0.001
CoveragePeriod <0.001
    >10 yrs 170 (39%) 242 (27%)
    1-5 yrs 60 (14%) 252 (28%)
    5-10 yrs 204 (47%) 413 (46%)
PaymentTerm <0.001
    Annualy 101 (23%) 320 (35%)
    Monthly 90 (21%) 130 (14%)
    Quartely 155 (36%) 287 (32%)
    Semi annual 88 (20%) 170 (19%)
DistributionChannel 0.040
    Bancas 110 (25%) 291 (32%)
    Company Agent 190 (44%) 360 (40%)
    Corp 60 (14%) 139 (15%)
    General Agency 47 (11%) 79 (8.7%)
    Others PD 27 (6.2%) 38 (4.2%)
AgentYearSVR 0.042
    1 142 (33%) 280 (31%)
    2 186 (43%) 332 (37%)
    3 84 (19%) 223 (25%)
    4 11 (2.5%) 29 (3.2%)
    5 8 (1.8%) 24 (2.6%)
    6 3 (0.7%) 19 (2.1%)
1 n (%); Median (IQR)
2 Pearson's Chi-squared test; Wilcoxon rank sum test

Explore Categorical Variables with Fisher’s and Chi-Square

DataExplorer

Plot categorical variable with DataExplorer

# Bar charts of the policy data, overall ...
policy %>% plot_bar()

# ... and broken down by lapse status.
policy %>% plot_bar(by = "Lapsed")

SmartEDA

ExpCatViz()

library(SmartEDA)
library("viridis")
# Occupation bar chart restricted to lapsed policies.
# The stray trailing comma after `col` was removed: it passed an extra,
# empty argument to ExpCatViz().
ExpCatViz(
  policy %>%
    select(Lapsed, Occupation) %>%
    filter(Lapsed == "Lapsed"),
  target = "Lapsed",
  col = hcl.colors(1, "Dark2")
)
## [[1]]

# Occupation split by lapse status, using a two-colour viridis palette.
# Palettes tried before: hcl.colors(2, "Dark2", alpha = 0.8),
# rainbow(2, alpha = 0.7), hcl.colors(2, "viridis", alpha = 0.8).
ExpCatViz(
  select(policy, Lapsed, Occupation),
  target = "Lapsed",
  col = viridis(n = 2, alpha = 0.7)
)
## [[1]]

ExpCatStat()

ExpCatStat() function provides summary statistics for all character or categorical columns in the dataframe.

# Association statistics of the variables against the target `Lapsed`.
# flextable is namespace-qualified because library(flextable) is only
# attached further down in this document.
ExpCatStat(policy,
           Target = "Lapsed") %>%
  flextable::flextable()

Variable

Target

Unique

Chi-squared

p-value

df

IV Value

Cramers V

Degree of Association

Predictive Power

PO_Sex

Lapsed

2

0.979

0.349

0

0.03

Very Weak

Not Predictive

PO_Married

Lapsed

2

1.545

0.230

0

0.03

Very Weak

Not Predictive

Occupation

Lapsed

4

101.184

0.000

0

0.27

Moderate

Not Predictive

Phone_registered

Lapsed

3

0.004

0.946

0

0.00

Very Weak

Not Predictive

PO_is_INS

Lapsed

2

8.134

0.005

0

0.08

Very Weak

Not Predictive

INS_Sex

Lapsed

2

7.674

0.005

0

0.08

Very Weak

Not Predictive

CoveragePeriod

Lapsed

3

39.625

0.000

0

0.17

Weak

Not Predictive

PaymentTerm

Lapsed

4

22.659

0.000

0

0.13

Weak

Not Predictive

DistributionChannel

Lapsed

5

10.001

0.044

0

0.09

Weak

Not Predictive

NumOfReinstated

Lapsed

6

70.601

0.000

0

0.23

Moderate

Not Predictive

NumOfClaims

Lapsed

7

9.719

0.073

0

0.09

Weak

Not Predictive

NumOfEmails

Lapsed

7

6.496

0.266

0

0.07

Very Weak

Not Predictive

NumOfCalls

Lapsed

7

8.480

0.144

0

0.08

Very Weak

Not Predictive

AgentYearSVR

Lapsed

6

11.549

0.043

0

0.09

Weak

Not Predictive

PO_Age

Lapsed

10

12.523

0.186

0

0.10

Weak

Not Predictive

INS_Age

Lapsed

10

16.114

0.057

0

0.11

Weak

Not Predictive

Premium

Lapsed

10

388.714

0.000

0

0.54

Strong

Not Predictive

  • Chi-squared: χ2

  • p-value: with a p-value < .05, we reject the null hypothesis of independence and accept that the two variables are related to each other. We then select those variables as predictors for prediction.

  • IV Value: Information Value helps determine which columns in a data set have predictive power or influence on the value of a specified dependent variable.

  • Cramers V : Cramers V is a measure of the strength of association between two nominal variables.

  • Predictive Power :

{ggstatsplot}

Visualization with statistic with ggstatsplot

  • It counts and calculates percentages for every category.
  • It visualizes the frequency table in the form of stacked bars and provides the statistical details including: p-value in addition to visualization which allows us to make a conclusion or inference
library(ggstatsplot)
# Stacked bars of lapse status per distribution channel, labelled with both
# counts and percentages.
ggbarstats(
  data = policy,
  x = Lapsed,
  y = DistributionChannel,
  label = "both"
)

Explore Numeric Variables with Descriptive Statistic

{dlookr}

library(dlookr)
library(flextable)
# Print numbers with 3 significant digits from here on.
options(digits = 3)
# Descriptive statistics table rendered as a flextable.
flextable(dlookr::describe(policy))

described_variables

n

na

mean

sd

se_mean

IQR

skewness

kurtosis

p00

p01

p05

p10

p20

p25

p30

p40

p50

p60

p70

p75

p80

p90

p95

p99

p100

NumOfReinstated

1,341

0

0.676

1.08

0.0295

1

1.5909

1.73

0

0

0

0

0

0

0

0

0

0

1

1

1

2

3

4

5

NumOfClaims

1,339

2

0.561

1.06

0.0290

1

2.0494

3.80

0

0

0

0

0

0

0

0

0

0

0

1

1

2

3

4

5

NumOfEmails

1,340

1

1.293

1.04

0.0283

1

1.2893

1.81

0

0

0

0

1

1

1

1

1

1

1

2

2

3

3

5

5

NumOfCalls

1,337

4

1.126

1.13

0.0308

2

1.2290

1.37

0

0

0

0

0

0

0

1

1

1

1

2

2

3

3

5

5

PO_Age

1,341

0

43.310

8.86

0.2419

15

-0.0078

-1.16

22

28

30

31

34

36

37

40

43

47

49

51

52

56

57

59

59

INS_Age

1,341

0

39.894

13.58

0.3709

24

-0.0136

-1.16

18

18

19

20

26

28

31

35

40

45

49

52

53

58

61

64

64

Premium

1,341

0

2,825.378

2,489.13

67.9725

2,736

1.3622

1.07

224

251

354

491

852

1,025

1,226

1,623

2,021

2,426

2,980

3,761

4,440

7,280

8,422

9,708

12,754

AgentYearSVR

1,341

0

2.111

1.06

0.0290

2

1.2285

2.06

1

1

1

1

1

1

1

2

2

2

2

3

3

3

4

6

6

We can add Classification group or control & treatment

# Numeric summaries computed separately for each lapse status.
dlookr::univar_numeric(group_by(policy, Lapsed))
## $statistics
## # A tibble: 16 × 10
##    described_v…¹ Lapsed     n    na    mean      sd se_mean
##    <chr>         <fct>  <int> <int>   <dbl>   <dbl>   <dbl>
##  1 AgentYearSVR  Lapsed   434     0 2   e+0 9.49e-1 4.56e-2
##  2 AgentYearSVR  Infor…   907     0 2.16e+0 1.11e+0 3.68e-2
##  3 INS_Age       Lapsed   434     0 3.94e+1 1.32e+1 6.33e-1
##  4 INS_Age       Infor…   907     0 4.01e+1 1.38e+1 4.57e-1
##  5 NumOfCalls    Lapsed   432     2 1   e+0 1.04e+0 5.02e-2
##  6 NumOfCalls    Infor…   905     2 1.19e+0 1.16e+0 3.86e-2
##  7 NumOfClaims   Lapsed   434     0 6.36e-1 1.14e+0 5.46e-2
##  8 NumOfClaims   Infor…   905     2 5.25e-1 1.02e+0 3.39e-2
##  9 NumOfEmails   Lapsed   434     0 1.22e+0 9.89e-1 4.75e-2
## 10 NumOfEmails   Infor…   906     1 1.33e+0 1.06e+0 3.51e-2
## 11 NumOfReinsta… Lapsed   434     0 1.02e+0 1.30e+0 6.22e-2
## 12 NumOfReinsta… Infor…   907     0 5.12e-1 9.15e-1 3.04e-2
## 13 PO_Age        Lapsed   434     0 4.26e+1 8.75e+0 4.20e-1
## 14 PO_Age        Infor…   907     0 4.36e+1 8.89e+0 2.95e-1
## 15 Premium       Lapsed   434     0 4.62e+3 3.04e+3 1.46e+2
## 16 Premium       Infor…   907     0 1.97e+3 1.57e+3 5.22e+1
## # … with 3 more variables: IQR <dbl>, skewness <dbl>,
## #   median <dbl>, and abbreviated variable name
## #   ¹​described_variables

The diagnose_numeric() function reports the usual five-number summary (the same values a box plot shows), plus the number of zeros, negative values and outliers:

# Five-number summary plus zero, negative and outlier counts, as a flextable.
flextable(diagnose_numeric(policy))

variables

min

Q1

mean

median

Q3

max

zero

minus

outlier

NumOfReinstated

0

0

0.676

0

1

5

861

0

123

NumOfClaims

0

0

0.561

0

1

5

962

0

91

NumOfEmails

0

1

1.293

1

2

5

221

0

45

NumOfCalls

0

0

1.126

1

2

5

424

0

0

PO_Age

22

36

43.310

43

51

59

0

0

0

INS_Age

18

28

39.894

40

52

64

0

0

0

Premium

224

1,025

2,825.378

2,021

3,761

12,754

0

0

101

AgentYearSVR

1

1

2.111

2

3

6

0

0

0

{SmartEDA}

For numeric variables, SmartEDA's ExpNumStat() function provides the richest and most comprehensive descriptive statistics table. We can display statistics for all records, for grouped records, or for both.

# Overall descriptive statistics (by = "A")
flextable(ExpNumStat(policy, by = "A"))

Vname

Group

TN

nNeg

nZero

nPos

NegInf

PosInf

NA_Value

Per_of_Missing

sum

min

max

mean

median

SD

CV

IQR

Skewness

Kurtosis

INS_Age

All

1,341

0

0

1,341

0

0

0

0

53,498

18

64

39.9

40

13.58

0.340

24

-0.014

-1.16

PO_Age

All

1,341

0

0

1,341

0

0

0

0

58,079

22

59

43.3

43

8.86

0.205

15

-0.008

-1.16

Premium

All

1,341

0

0

1,341

0

0

0

0

3,788,832

224

12,754

2,825.4

2,021

2,489.13

0.881

2,736

1.361

1.06

# Descriptive statistics per level of the grouping variable `Lapsed` (by = "G")
flextable(ExpNumStat(policy, by = "G", gp = "Lapsed"))

Vname

Group

TN

nNeg

nZero

nPos

NegInf

PosInf

NA_Value

Per_of_Missing

sum

min

max

mean

median

SD

CV

IQR

Skewness

Kurtosis

INS_Age

Lapsed:Lapsed

434

0

0

434

0

0

0

0

17,088

18

64

39.4

39

13.19

0.335

20.8

0.032

-1.03

INS_Age

Lapsed:Inforce

907

0

0

907

0

0

0

0

36,410

18

64

40.1

41

13.76

0.343

24.0

-0.037

-1.22

PO_Age

Lapsed:Lapsed

434

0

0

434

0

0

0

0

18,499

26

59

42.6

42

8.75

0.205

15.0

0.100

-1.18

PO_Age

Lapsed:Inforce

907

0

0

907

0

0

0

0

39,580

22

59

43.6

44

8.89

0.204

15.0

-0.060

-1.14

Premium

Lapsed:Lapsed

434

0

0

434

0

0

0

0

2,004,511

249

12,754

4,618.7

4,040

3,038.72

0.658

5,514.5

0.305

-1.07

Premium

Lapsed:Inforce

907

0

0

907

0

0

0

0

1,784,321

224

9,600

1,967.3

1,662

1,572.56

0.799

1,571.0

1.913

4.56

# Overall and per-group statistics together (by = "GA"), with the 25% / 75%
# quantiles and outlier columns, rounded to 2 decimals
flextable(
  ExpNumStat(policy, by = "GA", gp = "Lapsed",
             Qnt = c(.25, .75), Outlier = TRUE, round = 2)
)

Vname

Group

TN

nNeg

nZero

nPos

NegInf

PosInf

NA_Value

Per_of_Missing

sum

min

max

mean

median

SD

CV

IQR

Skewness

Kurtosis

25%

75%

LB.25%

UB.75%

nOutliers

INS_Age

Lapsed:All

1,341

0

0

1,341

0

0

0

0

53,498

18

64

39.9

40

13.58

0.34

24.0

-0.01

-1.16

28.0

52

-8.00

88.0

0

INS_Age

Lapsed:Lapsed

434

0

0

434

0

0

0

0

17,088

18

64

39.4

39

13.19

0.34

20.8

0.03

-1.03

29.2

50

-1.88

81.1

0

INS_Age

Lapsed:Inforce

907

0

0

907

0

0

0

0

36,410

18

64

40.1

41

13.76

0.34

24.0

-0.04

-1.22

28.0

52

-8.00

88.0

0

PO_Age

Lapsed:All

1,341

0

0

1,341

0

0

0

0

58,079

22

59

43.3

43

8.86

0.20

15.0

-0.01

-1.16

36.0

51

13.50

73.5

0

PO_Age

Lapsed:Lapsed

434

0

0

434

0

0

0

0

18,499

26

59

42.6

42

8.75

0.21

15.0

0.10

-1.18

35.0

50

12.50

72.5

0

PO_Age

Lapsed:Inforce

907

0

0

907

0

0

0

0

39,580

22

59

43.6

44

8.89

0.20

15.0

-0.06

-1.14

36.0

51

13.50

73.5

0

Premium

Lapsed:All

1,341

0

0

1,341

0

0

0

0

3,788,832

224

12,754

2,825.4

2,021

2,489.13

0.88

2,736.0

1.36

1.06

1,025.0

3,761

-3,079.00

7,865.0

101

Premium

Lapsed:Lapsed

434

0

0

434

0

0

0

0

2,004,511

249

12,754

4,618.7

4,040

3,038.72

0.66

5,514.5

0.30

-1.07

1,934.2

7,449

-6,337.50

15,720.5

0

Premium

Lapsed:Inforce

907

0

0

907

0

0

0

0

1,784,321

224

9,600

1,967.3

1,662

1,572.56

0.80

1,571.0

1.91

4.56

875.5

2,446

-1,481.00

4,803.0

63

Explore distribution with Skewness and Kurtosis test

Many statistical tests depend on symmetric and normally distributed data

{DataExplorer}

library(DataExplorer)
library(ISLR) # for Wage dataset
# Histograms first, then density curves, for the Wage data.
Wage %>% plot_histogram()

Wage %>% plot_density()

Histogram and density plots give us a first impression of how the data is distributed.

# Same two plots for the policy data, leaving out the Num* columns.
plot_histogram(select(policy, -starts_with("Num")))

plot_density(select(policy, -starts_with("Num")))

Skewness and Kurtosis

The symmetry can be described by Skewness and Kurtosis measures:

  • Skewness tells us whether the distribution is symmetric or not. It also tells us whether

    • it’s skewed to the left (skewness <0), or
    • it’s skewed to the right (skewness >0)
  • Kurtosis tells us how heavy the tails of the distribution are, i.e. how prone it is to outliers


# Density plots of the two variables whose skewness is compared below.
airquality %>%
  select(Ozone, Wind) %>%
  plot_density()

# TRUE instead of T: `T` is an ordinary variable that can be reassigned.
skewness(airquality$Ozone, na.rm = TRUE)
## [1] 1.23
skewness(airquality$Wind, na.rm = TRUE)
## [1] 0.344
  • The skewness of the Ozone distribution equals 1.23 (`skewness(airquality$Ozone, na.rm = TRUE)`) - FAR AWAY FROM ZERO, which suggests Ozone is not normally distributed.

  • The skewness of the Wind distribution equals 0.344 (`skewness(airquality$Wind, na.rm = TRUE)`) - IS THIS FAR ENOUGH FROM ZERO ???

We will run the D’Agostino skewness test from moments library as below

library(moments)
# D'Agostino skewness test for Ozone; the alternative hypothesis is that the
# data have a skewness.
agostino.test(airquality$Ozone)
## 
##  D'Agostino skewness test
## 
## data:  airquality$Ozone
## skew = 1, z = 5, p-value = 3e-06
## alternative hypothesis: data have a skewness

The p-value for Ozone is very small, so we reject the null hypothesis that the data is not skewed. In other words, the Ozone data is significantly skewed and therefore NOT NORMALLY DISTRIBUTED!

# Same D'Agostino skewness test, now for Wind.
agostino.test(airquality$Wind)
## 
##  D'Agostino skewness test
## 
## data:  airquality$Wind
## skew = 0.3, z = 1.8, p-value = 0.08
## alternative hypothesis: data have a skewness

The p-value for Wind is above the usual significance threshold (> .05), so we cannot reject the null hypothesis that the data is not skewed. This means we can treat the Wind data as not skewed and therefore, as far as symmetry is concerned, NORMALLY DISTRIBUTED.

Kurtosis

The Kurtosis is the measure of heavy tails or outliers present in the distribution.

The Kurtosis value for a normal distribution is around 3.

The Anscombe-Glynn anscombe.test() provides a statistical test for whether the kurtosis is around 3.

# Anscombe-Glynn kurtosis test for Ozone; the alternative hypothesis is that
# the kurtosis is not equal to 3.
anscombe.test(airquality$Ozone)
## 
##  Anscombe-Glynn kurtosis test
## 
## data:  airquality$Ozone
## kurt = 4, z = 2, p-value = 0.03
## alternative hypothesis: kurtosis is not equal to 3
# Same Anscombe-Glynn kurtosis test, now for Wind.
anscombe.test(airquality$Wind)
## 
##  Anscombe-Glynn kurtosis test
## 
## data:  airquality$Wind
## kurt = 3.1, z = 0.4, p-value = 0.7
## alternative hypothesis: kurtosis is not equal to 3
  • The kurtosis for Ozone is 4.1 with a p-value < .05, which is significantly far away from 3, indicating that Ozone is NOT NORMALLY DISTRIBUTED and that OUTLIERS ARE PROBABLY PRESENT.

  • The kurtosis for Wind is around 3 and the p-value (> .05) tells us that we cannot reject a kurtosis of 3: the Wind distribution can be treated as NORMALLY DISTRIBUTED with NO OUTLIERS.

Explore Normality with QQ-Plots and Shapiro-Wilk.

The normality check helps us to determine a correct statistical test:

Data Test For 2 Groups For > 2 Groups
Normally Distributed Parametric Test T-Test ANOVA
Not Normally Distributed Non-Parametric Mann-Whitney Kruskal-Wallis

Checking the normality using QQ-Plots and Shapiro-Wilk

{DataExplorer}

# Q-Q plots for iris, overall and then split by species.
iris %>% plot_qq()

iris %>% plot_qq(by = "Species")

# Same check for the ages and the premium in the policy data.
plot_qq(select(policy, PO_Age, INS_Age, Premium))

policy %>% plot_qq(by = "Lapsed")

{dlookr} visualization

With dlookr, we not only view Q-Q plot but also histogram of the original and histograms of two most common transformation of data: log and sqrt transformation. It helps us to see whether transformation improves something or not

# Normality diagnostics for petal length within each species.
plot_normality(group_by(iris, Species), Petal.Length)

# Premium within each lapse status ...
plot_normality(group_by(policy, Lapsed), Premium)

# ... and for all policies together.
plot_normality(policy, Premium)

The Q-Q plots can be interpreted : points are situated close to the diagonal line - the data is probably normally distributed.

But how close is enough to conclude it’s actually normally distributed.

{ggqqplot}

ggqqplot helps us to judge whether the deviation from normality is large enough to matter.

library(ggpubr)
# One Q-Q panel per group: sepal length by species, then premium by lapse
# status.
ggqqplot(data = iris, x = "Sepal.Length", facet.by = "Species")

ggqqplot(data = policy, x = "Premium", facet.by = "Lapsed")

{dlookr} Shapiro-Wilk normality test

agostino.test(airquality$Ozone) # Ozone is significantly skewed (p-value = 3e-06)
## 
##  D'Agostino skewness test
## 
## data:  airquality$Ozone
## skew = 1, z = 5, p-value = 3e-06
## alternative hypothesis: data have a skewness
agostino.test(airquality$Wind) # no significant skewness (p > .05): Wind is consistent with a normal distribution
## 
##  D'Agostino skewness test
## 
## data:  airquality$Wind
## skew = 0.3, z = 1.8, p-value = 0.08
## alternative hypothesis: data have a skewness
# Shapiro-Wilk normality test for the numeric columns of airquality; the
# numeric results are rounded to 3 decimals before being rendered as a table.
# across() replaces the superseded mutate_if().
normality(airquality) %>%
  mutate(across(where(is.numeric), ~ round(.x, 3))) %>%
  flextable()

vars

statistic

p_value

sample

Ozone

0.879

0.000

153

Solar.R

0.942

0.000

153

Wind

0.986

0.118

153

Temp

0.976

0.009

153

Month

0.888

0.000

153

Day

0.953

0.000

153

The null hypothesis of the Shapiro-Wilk test is that the population is normally distributed:

  • If p-value < alpha level (.05), the null hypothesis is rejected and there is evidence that the data tested are not normally distributed.
  • If p-value > .05, the null hypothesis (that the data came from a normally distributed population) cannot be rejected.

Only Wind has a p_value > .05, so Wind is the only variable for which normality cannot be rejected.

# Shapiro-Wilk normality test for the numeric columns of the policy data; the
# numeric results are rounded to 3 decimals before being rendered as a table.
# across() replaces the superseded mutate_if().
normality(policy) %>%
  mutate(across(where(is.numeric), ~ round(.x, 3))) %>%
  flextable()

vars

statistic

p_value

sample

NumOfReinstated

0.675

0

1,341

NumOfClaims

0.596

0

1,341

NumOfEmails

0.789

0

1,341

NumOfCalls

0.817

0

1,341

PO_Age

0.959

0

1,341

INS_Age

0.955

0

1,341

Premium

0.836

0

1,341

AgentYearSVR

0.824

0

1,341

With p_value < .05 for every variable, none of the numeric variables is normally distributed.

In conjunction with group_by(), we can run the normality check on subsets of the data

# Normality test within every cut / color / clarity combination of diamonds
normality(group_by(diamonds, cut, color, clarity))
## # A tibble: 1,932 × 7
##    variable cut   color clarity statistic    p_value sample
##    <chr>    <ord> <ord> <ord>       <dbl>      <dbl>  <dbl>
##  1 carat    Fair  D     I1          0.888    3.73e-1      4
##  2 carat    Fair  D     SI2         0.867    1.83e-5     56
##  3 carat    Fair  D     SI1         0.849    3.79e-6     58
##  4 carat    Fair  D     VS2         0.931    9.20e-2     25
##  5 carat    Fair  D     VS1         0.973    8.93e-1      5
##  6 carat    Fair  D     VVS2        0.871    1.27e-1      9
##  7 carat    Fair  D     VVS1        0.931    4.93e-1      3
##  8 carat    Fair  D     IF          0.990    8.06e-1      3
##  9 carat    Fair  E     I1          0.941    5.96e-1      9
## 10 carat    Fair  E     SI2         0.867    8.07e-7     78
## # … with 1,922 more rows
# Normality test within every PO_Sex / Occupation combination of the policy data
normality(group_by(policy, PO_Sex, Occupation))
## # A tibble: 64 × 6
##    variable        PO_Sex Occupat…¹ stati…²  p_value sample
##    <chr>           <fct>  <fct>       <dbl>    <dbl>  <dbl>
##  1 NumOfReinstated female Grp_1       0.651 6.82e-16    126
##  2 NumOfReinstated female Grp_2       0.696 3.03e-20    232
##  3 NumOfReinstated female Grp_3       0.679 3.81e-19    197
##  4 NumOfReinstated female Grp_4       0.705 3.75e-11     77
##  5 NumOfReinstated male   Grp_1       0.692 1.35e-15    137
##  6 NumOfReinstated male   Grp_2       0.677 9.41e-22    254
##  7 NumOfReinstated male   Grp_3       0.631 6.56e-22    228
##  8 NumOfReinstated male   Grp_4       0.667 5.57e-13     90
##  9 NumOfClaims     female Grp_1       0.598 4.97e-17    126
## 10 NumOfClaims     female Grp_2       0.599 6.72e-23    232
## # … with 54 more rows, and abbreviated variable names
## #   ¹​Occupation, ²​statistic

In summary, given enough data we can run the Shapiro-Wilk test for normality - ignore Skewness and Visualizations

Let’s take a look at all of the normality tests below

# Keep only the age column of the Wage rows whose education level is
# "1. < HS Grad"
bla <- select(
  filter(Wage, education == "1. < HS Grad"),
  age
)

normality(bla) %>% flextable()

vars

statistic

p_value

sample

age

0.986

0.00826

268

shapiro.test(bla$age)
## 
##  Shapiro-Wilk normality test
## 
## data:  bla$age
## W = 1, p-value = 0.008
plot_density(bla) # the density looks like the bell curve of a normal distribution

# But now take a look at the histogram of the same data
plot_histogram(bla)

agostino.test(bla$age) 
## 
##  D'Agostino skewness test
## 
## data:  bla$age
## skew = 0.2, z = 1.6, p-value = 0.1
## alternative hypothesis: data have a skewness
anscombe.test(bla$age)
## 
##  Anscombe-Glynn kurtosis test
## 
## data:  bla$age
## kurt = 3, z = -1, p-value = 0.2
## alternative hypothesis: kurtosis is not equal to 3
ggqqplot(bla$age)

Compare Groups via Box-PLots & Non-parametric test

{DataExplorer}

plot_boxplot(iris, by="Species")

Then the question is “Do these groups differ significantly ?”

{ggstatsplot}

With ggbetweenstats() from ggstatsplot, we can tell how different the groups are

# Compare Sepal.Length between the Species groups of iris.
# type = "np" requests the non-parametric variant of the test.
ggbetweenstats(
  data = iris,
  x = Species,
  y = Sepal.Length,
  type = "np"
)

The p-value tells you whether there are significant differences between the groups. The function also conducts the appropriate multiple pairwise comparisons to show between which groups exactly the differences are.

Explore Correlations with Different Methods

{dlookr} - Correlation

correlate(airquality,Ozone)
## # A tibble: 5 × 3
##   var1  var2    coef_corr
##   <fct> <fct>       <dbl>
## 1 Ozone Solar.R    0.348 
## 2 Ozone Wind      -0.602 
## 3 Ozone Temp       0.698 
## 4 Ozone Month      0.165 
## 5 Ozone Day       -0.0132
# Correlation plot using Kendall's rank correlation (non-parametric). The
# method name is spelled out in full instead of relying on partial matching
# of the misspelled "kendal".
plot_correlate(airquality, method = "kendall")

plot_correlate() methods:

  • parametric: PEARSON
  • non-parametric: KENDALL or SPEARMAN

correlate(policy,Premium)
## # A tibble: 7 × 3
##   var1    var2            coef_corr
##   <fct>   <fct>               <dbl>
## 1 Premium NumOfReinstated   0.0602 
## 2 Premium NumOfClaims      -0.0673 
## 3 Premium NumOfEmails      -0.0116 
## 4 Premium NumOfCalls        0.00579
## 5 Premium PO_Age            0.00407
## 6 Premium INS_Age           0.261  
## 7 Premium AgentYearSVR      0.0279
plot_correlate(policy)

{ggstatsplot}

In addition to plotting the correlations, we may need to test which correlations are actually significant, using the ggcorrmat() function

ggcorrmat(data=iris)

It displays the correlation coefficients on a colored heatmap showing positive and negative correlations.

## Correlation matrix for the diamonds data.
## let's use just 5% of the data (a random sample of the rows) to speed it up;
## no seed is set in this chunk, so the sample may differ between runs
ggcorrmat(
  data = dplyr::sample_frac(ggplot2::diamonds, size = 0.05),
  ## variables are selected tidyselect-style: carat plus every column from
  ## depth through z
  cor.vars = c(carat, depth:z), ## note how the variables are getting selected
  ## display labels, one per selected variable, in the same order
  cor.vars.names = c(
    "carat",
    "total depth",
    "table",
    "price",
    "length (in mm)",
    "width (in mm)",
    "depth (in mm)"
  ),
  title = "Relationship between diamond attributes and price",
  subtitle = "Dataset: Diamonds from ggplot2 package",
  ## extra arguments handed to the underlying ggcorrplot call
  ggcorrplot.args = list(outline.color = "black", hc.order = TRUE)
)

# Correlation matrix for the policy data. The trailing comma after `subtitle`
# was removed so that no empty argument is passed to the call.
ggcorrmat(
  data = policy,
  title = "Relationship between Numeric variables",
  subtitle = "Dataset: Policy Lapsed"
)

If there is a particular correlation you want to view in more detail

ggcorrmat(airquality)

# Scatter plot of Temp against Ozone with the correlation test reported
ggscatterstats(
  data = airquality,
  x = Ozone,
  y = Temp,
  type = "np" # "np" = non-parametric: Spearman's rank correlation (note: "robust" is a different type value)
)

ggscatterstats(
  data = airquality,
  x = Ozone,
  y = Temp
)

# Petal.Length against Sepal.Width for iris; only the points with
# Sepal.Length > 7.6 are labelled (by Species), and a rug is added along the
# bottom axis.
ggscatterstats(
  data = iris,
  x = Sepal.Width,
  y = Petal.Length,
  label.var = Species,
  label.expression = Sepal.Length > 7.6
) + ggplot2::geom_rug(sides = "b")

# Scatter plot of INS_Age against Premium with the correlation test reported
ggscatterstats(
  data = policy,
  x = Premium,
  y = INS_Age,
  type = "np" # "np" = non-parametric: Spearman's rank correlation (note: "robust" is a different type value)
)

{PerformanceAnalytics}

The chart below displays not only the correlation coefficients but also a histogram for every numeric variable and a scatter plot for every pair of variables. The significance stars are particularly useful, as they show how significant each correlation is.

library(PerformanceAnalytics)
# Kendall correlation chart for all iris columns except Species
chart.Correlation(select(iris, -Species), method = "kendall")

# Kendall correlation chart for the premium and the two age variables
chart.Correlation(
  select(policy, Premium, INS_Age, PO_Age),
  method = "kendall"
)

{dlookr} - linear models

The compare_numeric() function computes information to examine the relationship between numerical variables using Pearson’s correlation and simple linear models

bla <- compare_numeric(iris)

bla$correlation
## # A tibble: 6 × 3
##   var1         var2         coef_corr
##   <chr>        <chr>            <dbl>
## 1 Sepal.Length Sepal.Width     -0.118
## 2 Sepal.Length Petal.Length     0.872
## 3 Sepal.Length Petal.Width      0.818
## 4 Sepal.Width  Petal.Length    -0.428
## 5 Sepal.Width  Petal.Width     -0.366
## 6 Petal.Length Petal.Width      0.963
# Linear-model summary from compare_numeric(), with numeric columns rounded to
# 2 decimals for display. across() replaces the superseded mutate_if().
bla$linear %>%
  mutate(across(where(is.numeric), ~ round(.x, 2))) %>%
  flextable()

var1

var2

r.squared

adj.r.squared

sigma

statistic

p.value

df

logLik

AIC

BIC

deviance

df.residual

nobs

Sepal.Length

Sepal.Width

0.01

0.01

0.83

2.07

0.15

1

-183.0

372

381

100.8

148

150

Sepal.Length

Petal.Length

0.76

0.76

0.41

468.55

0.00

1

-77.0

160

169

24.5

148

150

Sepal.Length

Petal.Width

0.67

0.67

0.48

299.17

0.00

1

-101.1

208

217

33.8

148

150

Sepal.Width

Petal.Length

0.18

0.18

0.40

33.28

0.00

1

-72.6

151

160

23.1

148

150

Sepal.Width

Petal.Width

0.13

0.13

0.41

22.91

0.00

1

-77.0

160

169

24.5

148

150

Petal.Length

Petal.Width

0.93

0.93

0.48

1,882.45

0.00

1

-101.2

208

217

33.8

148

150

We can plot all the results of the compare_numeric() function, which displays the strength of the correlation with circles, the spread of the data with box plots, and the linear regression itself.

plot(bla)

# Pairwise comparison of the premium and the two age variables only
plo <- compare_numeric(select(policy, Premium, INS_Age, PO_Age))

plo$correlation
## # A tibble: 3 × 3
##   var1    var2    coef_corr
##   <chr>   <chr>       <dbl>
## 1 Premium INS_Age   0.261  
## 2 Premium PO_Age    0.00407
## 3 INS_Age PO_Age    0.0593
# Linear-model summary from compare_numeric(), with numeric columns rounded to
# 2 decimals for display. across() replaces the superseded mutate_if().
plo$linear %>%
  mutate(across(where(is.numeric), ~ round(.x, 2))) %>%
  flextable()

var1

var2

r.squared

adj.r.squared

sigma

statistic

p.value

df

logLik

AIC

BIC

deviance

df.residual

nobs

Premium

INS_Age

0.07

0.07

2,404.0

97.58

0.00

1

-12,341

24,689

24,704

7,738,399,551

1,339

1,341

Premium

PO_Age

0.00

0.00

2,490.0

0.02

0.88

1

-12,388

24,783

24,799

8,302,200,823

1,339

1,341

INS_Age

PO_Age

0.00

0.00

13.6

4.72

0.03

1

-5,398

10,802

10,818

246,278

1,339

1,341

plot(plo)

Explore data with simple linear & non-linear models

Exploratory modeling

library(ggplot2)
# Sepal.Width against Sepal.Length with a smoothed trend, one panel per species
ggplot(data = iris, mapping = aes(x = Sepal.Length, y = Sepal.Width)) +
  geom_point() +
  geom_smooth() +
  facet_wrap(vars(Species))

Explore missing values

{dlookr}

plot_na_intersect() plots the combinations of variables that include missing values

plot_na_intersect(airquality)

Two variables have missing values: Ozone and Solar.R. In 2 observations, both variables are missing at the same time.

plot_na_intersect(policy)

imputate_na

# Impute the missing Ozone values from Temp with the knn method, then plot
# the result
plot(imputate_na(airquality, xvar = Ozone, yvar = Temp, method = "knn"))

  • xvar: the variable with missing values to be imputed
  • yvar: the variable used to predict the missing values
# Impute the missing NumOfEmails values from Lapsed with the rpart method,
# then plot the result
plot(
  imputate_na(policy, xvar = NumOfEmails, yvar = Lapsed, method = "rpart")
)

Explore Outliers

{performance}

check_outliers() from the {performance} package

library(performance)
plot(check_outliers(airquality$Wind))

# Same outlier check on Wind, this time with the IQR method
plot(check_outliers(airquality$Wind, method = "iqr"))

{dlookr}

diagnose_outlier(diamonds) %>% flextable()

variables

outliers_cnt

outliers_ratio

outliers_mean

with_mean

without_mean

carat

1,889

3.5020

2.15

0.798

0.749

depth

2,545

4.7182

61.20

61.749

61.776

table

605

1.1216

64.84

57.457

57.373

price

3,538

6.5591

14,944.78

3,932.800

3,159.807

x

32

0.0593

7.24

5.731

5.730

y

29

0.0538

9.77

5.735

5.732

z

49

0.0908

4.05

3.539

3.538

diagnose_outlier() provides not only the count and percentage of outliers in each variable but also the mean of the outliers and the mean of each variable with and without the outliers. In this way, we can see how strongly the outliers influence each variable.

For example, depth has 2,545 outliers but its mean is almost identical with and without them, whereas the outliers of price heavily influence the mean of price.

diagnose_outlier(policy) %>% flextable()

variables

outliers_cnt

outliers_ratio

outliers_mean

with_mean

without_mean

NumOfReinstated

123

9.17

3.33

0.676

0.408

NumOfClaims

91

6.79

3.59

0.561

0.340

NumOfEmails

45

3.36

4.49

1.293

1.181

NumOfCalls

0

0.00

1.126

1.126

PO_Age

0

0.00

43.310

43.310

INS_Age

0

0.00

39.894

39.894

Premium

101

7.53

8,948.99

2,825.378

2,326.600

AgentYearSVR

0

0.00

2.111

2.111

# Outlier diagnostic plots for Ozone and Wind only
plot_outlier(select(airquality, Ozone, Wind))

# Plot only the diamonds variables whose outlier ratio (a percentage) exceeds 5
plot_outlier(
  diamonds,
  diagnose_outlier(diamonds) %>%
    filter(outliers_ratio > 5) %>%
    pull(variables)
)

Impute outliers

# Impute the outliers of carat with the "capping" method. Per the dlookr
# documentation, capping replaces upper outliers with the 95th percentile and
# lower outliers with the 5th percentile; no rows are removed.
bla <- imputate_outlier(diamonds, carat, method = "capping")
# Plot the imputation result to compare carat before and after
plot(bla)

summary(bla)
## Impute outliers with capping
## 
## * Information of Imputation (before vs after)
##                     Original  Imputation
## described_variables "value"   "value"   
## n                   "53940"   "53940"   
## na                  "0"       "0"       
## mean                "0.798"   "0.782"   
## sd                  "0.474"   "0.432"   
## se_mean             "0.00204" "0.00186" 
## IQR                 "0.64"    "0.64"    
## skewness            "1.117"   "0.699"   
## kurtosis            " 1.257"  "-0.491"  
## p00                 "0.2"     "0.2"     
## p01                 "0.24"    "0.24"    
## p05                 "0.3"     "0.3"     
## p10                 "0.31"    "0.31"    
## p20                 "0.35"    "0.35"    
## p25                 "0.4"     "0.4"     
## p30                 "0.42"    "0.42"    
## p40                 "0.53"    "0.53"    
## p50                 "0.7"     "0.7"     
## p60                 "0.9"     "0.9"     
## p70                 "1.01"    "1.01"    
## p75                 "1.04"    "1.04"    
## p80                 "1.13"    "1.13"    
## p90                 "1.51"    "1.51"    
## p95                 "1.7"     "1.7"     
## p99                 "2.18"    "1.74"    
## p100                "5.01"    "2.00"